/hps/nobackup/birney/users/ian/hmn_fstlibrary(here)
source(here::here("code/scripts/20210628_source.R"))NOTE: only run once.
date_of_collection = "20210809"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
out_file = here::here("code/snakemake/20210625/config",
paste(date_of_collection, "_all_traits.tsv", sep = ""))
# Get list of all traits in database
all_traits = gwasrapidd::get_traits()
# Filter out IDs that cause an error
all_traits_tbl = all_traits@traits %>%
dplyr::filter(efo_id != "CHEBI:7916") %>%
# Save to file
readr::write_tsv(out_file)all_traits_tbl %>%
DT::datatable(.)NOTE: only run once.
#Need to do them sequentially because the GWAS Catalog can't handle dozens (let alone thousands) of simultaneous queries.
date_of_collection = "20210809"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
# create output dir
out_dir = file.path(long_term_dir,
date_of_collection,
"associations_raw")
dir.create(out_dir, recursive = T)
# Get associations
counter = 0
lapply(all_traits_tbl$efo_id, function(EFO_ID) {
counter <<- counter + 1
# set output file name
out_path = file.path(out_dir,
paste(EFO_ID, ".rds", sep = ""))
# if output file doesn't already exist, get associations and save
if (!file.exists(out_path)){
out = gwasrapidd::get_associations(efo_id = EFO_ID)
saveRDS(out, out_path)
} else {
print(paste(counter, ". ", EFO_ID, ": File already exists.", sep = ""))
}
})Full Snakemake pipeline here: https://github.com/brettellebi/human_traits_fst/tree/master/code/snakemake/20210625
target_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd/20210809"
clump_params = "0.1_1000"assocs_path = file.path(target_dir, "associations_raw")
assocs_files = list.files(assocs_path, full.names = T)
# Read in associations
assocs_raw = lapply(assocs_files, function(EFO_ID){
readRDS(EFO_ID)
})
names(assocs_raw) = basename(assocs_files) %>%
stringr::str_remove(".rds")
# Get counts
raw_counts = purrr::map(assocs_raw, function(EFO_ID){
EFO_ID@risk_alleles %>%
dplyr::pull(variant_id) %>%
unique(.) %>%
length(.) %>%
data.frame("N" = .)
}) %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
# Add `trait`
dplyr::left_join(all_traits_tbl %>%
dplyr::select(efo_id, trait),
by = c("EFO_ID" = "efo_id")) %>%
# Rename to `TRAIT`
dplyr::rename(TRAIT = trait) %>%
# Remove all traits with N < 1
dplyr::filter(N >= 1)
raw_counts %>%
dplyr::arrange(dplyr::desc(N)) %>%
DT::datatable(.)Set minimum number of “independent” (i.e. post-clumped) SNPs per trait
min_snps = 10hits_path = file.path(target_dir, "pegas/fst/consol", clump_params, "hits.rds")
hits = readRDS(hits_path) %>%
lapply(., function(TRAIT_ID){
# convert to DF
TRAIT_ID %>%
data.frame(.) %>%
tibble::rownames_to_column(var = "SNP") %>%
dplyr::select(SNP, FST_PEGAS = Fst)
}) %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
dplyr::left_join(all_traits_tbl %>%
dplyr::select(EFO_ID = efo_id,
TRAIT = trait),
by = "EFO_ID")
# Get counts
hits_counts = hits %>%
dplyr::count(EFO_ID, TRAIT) %>%
dplyr::rename(N = n)
hits_counts %>%
dplyr::arrange(dplyr::desc(N)) %>%
DT::datatable(.)# Get vector of EFO_IDs with more than `min_snps`
keep_snps = hits %>%
dplyr::count(EFO_ID) %>%
dplyr::filter(n >= min_snps) %>%
dplyr::pull(EFO_ID)
# How many traits pass threshold
length(keep_snps)## [1] 587
# Filter hits for those traits
hits_filt = hits %>%
dplyr::filter(EFO_ID %in% keep_snps)NOTE one SNP ID can map to multiple loci, so there are duplicate SNP IDs with different Fst measures.
In the code below we remove all the duplicated SNPs which have periods in their names, e.g. {SNP_ID}.1 or {SNP_ID}.2.
That is, we keep the first SNP in the genome with the ID. This would bias toward SNPs “earlier” in the genome. Probably not consequential?
# Fst controls
controls_path = file.path(target_dir, "pegas/fst/consol", clump_params, "controls.rds")
controls = readRDS(controls_path) %>%
lapply(., function(TRAIT_ID){
# convert to DF
TRAIT_ID %>%
data.frame(.) %>%
tibble::rownames_to_column(var = "SNP") %>%
dplyr::select(SNP, FST_PEGAS = Fst)
}) %>%
dplyr::bind_rows(.id = "EFO_ID") %>%
dplyr::left_join(all_traits_tbl %>%
dplyr::select(EFO_ID = efo_id,
TRAIT = trait),
by = "EFO_ID") %>%
# NOTE: remove duplicated IDs
dplyr::filter(!grepl("\\.", SNP))
# Filter controls for traits with more than `min_snps` post-clumping
controls_filt = controls %>%
dplyr::filter(EFO_ID %in% keep_snps).clumped filesclumped_files = list.files(file.path(target_dir, "high_cov/plink/clumped"), pattern = "*.clumped", full.names = T)
clumped = lapply(clumped_files, function(FILE){
# note: version 2 doesn't parse properly, so we need to use version 1
readr::with_edition(1, readr::read_delim(FILE,
delim = " ",
col_types = "i-c-d-------",
trim_ws = T)
)
})
names(clumped) = stringr::str_remove(basename(clumped_files), ".clumped")pal_all = turbo(n = nrow(raw_counts))
names(pal_all) = raw_counts %>%
dplyr::arrange(desc(N)) %>%
dplyr::pull(TRAIT)snp_count_plot = list("PRE-CLUMP" = raw_counts,
"POST-CLUMP" = hits_counts) %>%
dplyr::bind_rows(.id = "FILTER") %>%
# order variables
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_all)),
FILTER = factor(FILTER, levels = c("PRE-CLUMP", "POST-CLUMP"))) %>%
ggplot() +
geom_col(aes(TRAIT, N, fill = TRAIT)) +
scale_fill_manual(values = pal_all) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank()) +
ylab("N LOCI") +
facet_wrap(~FILTER, nrow = 2) +
guides(fill = "none")
plotly::ggplotly(snp_count_plot,
tooltip = c("TRAIT"))width = 20
height = 10
dir.create(here::here("docs/plots", clump_params), recursive = T, showWarnings = T)
ggsave(here::here("docs/plots", clump_params, "20220314_snp_counts.png"),
plot = snp_count_plot,
device = "png",
width = width,
height = height,
units = "in",
dpi= 400)
ggsave(here::here("docs/plots", clump_params, "20220314_snp_counts.svg"),
plot = snp_count_plot,
device = "svg",
width = width,
height = height,
units = "in")Ranking by median Fst doesn’t capture the difference between pigmentation/HIV traits, and all others.
This difference is captured by the D statistic, but we would want a direction – i.e. do the hits have higher or lower \(F_{ST}\) than the controls?
Then we want to rank by that directional difference.
# Create full data frame of hits and controls
hits_v_controls_full = list("HIT" = hits_filt,
"CONTROL" = controls_filt) %>%
dplyr::bind_rows(.id = "TYPE") %>%
# create column for palette
dplyr::mutate(PAL_TRAIT = dplyr::if_else(TYPE == "HIT",
TRAIT,
"control"))# Run KS.test
# Split into list by trait
ks_list_full = hits_v_controls_full %>%
split(., f = .$TRAIT) %>%
purrr::map(., function(TRAIT){
split(TRAIT, f = TRAIT$TYPE)
})
# `Run ks.test`
ks_out_full = purrr::map(ks_list_full, function(TRAIT){
# Create output list
out = list()
# Run KS tests
out[["two.sided"]] = ks.test(TRAIT$HIT$FST_PEGAS,
TRAIT$CONTROL$FST_PEGAS,
alternative = "two.sided")
out[["less"]] = ks.test(TRAIT$HIT$FST_PEGAS,
TRAIT$CONTROL$FST_PEGAS,
alternative = "less")
out[["greater"]] = ks.test(TRAIT$HIT$FST_PEGAS,
TRAIT$CONTROL$FST_PEGAS,
alternative = "greater")
return(out)
})
# Pull out stats
ks_stats_full = purrr::map_dfr(ks_out_full, function(TRAIT){
tibble::tibble(D_TWO = TRAIT$two.sided$statistic,
D_LESS = TRAIT$less$statistic,
D_GREATER = TRAIT$greater$statistic)
}, .id = "TRAIT")The alternative = "two.sided" result is just the highest D of either “less” or “greater”.
ks_stats_full %>%
dplyr::arrange(desc(D_TWO)) %>%
DT::datatable(.)The documentation for ks.test says: >… in the two-sample case alternative = “greater” includes distributions for which x is stochastically smaller than y (the CDF of x lies above and hence to the left of that for y)…
So for traits with hits that have a lower \(F_{ST}\) than their controls will have a higher D under the "greater" test than the "less" test, e.g. body mass index.
Give those a negative sign:
ks_stats_full = ks_stats_full %>%
dplyr::mutate(D_SIGNED = dplyr::case_when(D_GREATER > D_LESS ~ -D_TWO,
D_GREATER < D_LESS ~ D_TWO,
D_GREATER == D_LESS ~ 0)) %>%
dplyr::arrange(D_SIGNED)
ks_stats_full %>%
DT::datatable(.)traits_d_ranked = ks_stats_full$TRAIT
d_rank_pal = viridisLite::turbo(n = length(traits_d_ranked))
names(d_rank_pal) = traits_d_rankedecdf_plot_face_w_controls_d_rank = hits_v_controls_full %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(d_rank_pal)),
PAL_TRAIT = factor(PAL_TRAIT, levels = names(d_rank_pal))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = PAL_TRAIT)) +
scale_colour_manual(values = d_rank_pal) +
theme_cowplot(rel_small = 9/14) +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5)) +
xlab(expression(italic(F[ST]))) +
ylab("Cumulative Probability") +
facet_wrap(~TRAIT)
out_path = here::here("docs/plots", clump_params, "20220314_ecdf_all_faceted_with_controls_d_rank.png")
dir.create(dirname(out_path), recursive = T, showWarnings = F)
ggsave(out_path,
plot = ecdf_plot_face_w_controls_d_rank,
device = "png",
width = 30,
height = 22,
units = "in",
dpi = 400)## Warning: Removed 154 rows containing non-finite values (stat_ecdf).
knitr::include_graphics(out_path)
plotly_ecdf_all_d_ranked = hits_filt %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(d_rank_pal))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = d_rank_pal) +
theme_bw() +
ggtitle("eCDF") +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5))
plotly::ggplotly(plotly_ecdf_all_d_ranked,
tooltip = c("TRAIT"))## Warning: Removed 5 rows containing non-finite values (stat_ecdf).
new_traits = c(
"heart rate variability measurement",
"alcohol abuse",
"aging",
"blood pressure",
"BMI-adjusted waist circumference",
"body mass index",
"intelligence",
"self reported educational attainment",
"asthma",
"body height",
"mathematical ability",
"type ii diabetes mellitus",
"hair colour measurement",
"cognitive function measurement",
"schizophrenia",
"brain measurement",
"Alzheimer's disease",
"stroke",
"hair color",
"hypertension",
"Myopia",
"BMI-adjusted hip circumference",
"hair shape measurement",
"eye colour measurement",
"skin pigmentation",
"skin pigmentation measurement",
"HIV-1 infection",
"viral load"
)ridges_plot_d_rank = hits_filt %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% new_traits) %>%
# group by trait to take unique SNPs
dplyr::group_by(TRAIT) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# reverse order of traits to put `body height` at the top
dplyr::mutate(TRAIT_REV = factor(TRAIT, levels = rev(names(d_rank_pal)))) %>%
# plot
ggplot(aes(FST_PEGAS, TRAIT_REV, fill = TRAIT_REV, colour = TRAIT_REV)) +
geom_density_ridges(scale = 2,
bandwidth = 0.003,
calc_ecdf = TRUE,
quantiles = 0.5,
quantile_lines = T,
jittered_points = T,
point_shape = '|', alpha = 0.85, point_size = 2,
position = position_points_jitter(height = 0),
) +
scale_fill_manual(values = d_rank_pal) +
scale_colour_manual(values = darker(d_rank_pal)) +
guides(fill = "none", colour = "none") +
theme_cowplot() +
scale_y_discrete(expand = expansion(add = c(0.2, 2.3))) +
xlab(expression(italic(F[ST]))) +
ylab(NULL)
ridges_plot_d_rankecdf_plot_face_d_rank = hits_v_controls_full %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% new_traits) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(d_rank_pal)),
PAL_TRAIT = factor(PAL_TRAIT, levels = names(d_rank_pal))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = PAL_TRAIT)) +
scale_colour_manual(values = d_rank_pal) +
theme_cowplot(rel_small = 9/14) +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5)) +
xlab(expression(italic(F[ST]))) +
ylab("Cumulative Probability") +
facet_wrap(~TRAIT, nrow = 7, ncol = 4)
ecdf_plot_face_d_rank## Warning: Removed 20 rows containing non-finite values (stat_ecdf).
ggsave(here::here("docs/plots", clump_params, "20220314_ecdf_all_faceted_with_controls_d_rank.png"),
plot = ecdf_plot_face_d_rank,
device = "png",
width = 8,
height = 10,
units = "in",
dpi = 400)## Warning: Removed 20 rows containing non-finite values (stat_ecdf).
ecdf_plot_all_d_rank = hits_filt %>%
# filter for `target_traits`
dplyr::filter(TRAIT %in% new_traits) %>%
# take unique SNPs
dplyr::group_by(EFO_ID) %>%
dplyr::distinct(SNP, .keep_all = T) %>%
dplyr::ungroup() %>%
# order FST_PEGAS so that it's properly plotted by plotly
dplyr::arrange(EFO_ID, FST_PEGAS) %>%
# order
dplyr::mutate(TRAIT = factor(TRAIT, levels = names(d_rank_pal))) %>%
ggplot() +
stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
scale_colour_manual(values = d_rank_pal) +
theme_cowplot() +
guides(colour = "none") +
theme(strip.text.x = element_text(size = 5)) +
xlab(expression(italic(F[ST]))) +
ylab("Cumulative Probability")
ecdf_plot_all_d_rankfinal_figure = cowplot::ggdraw() +
draw_plot(ridges_plot_d_rank,
x = 0, y = 0, width = .5, height = 1) +
draw_plot(ecdf_plot_face_d_rank,
x = .5, y = 0.35, width = .5, height = .65) +
draw_plot(ecdf_plot_all_d_rank,
x = .5, y = 0, width = .5, height = .35) +
draw_plot_label(label = c("A", "B", "C"), size = 25,
x = c(0, .5, .5), y = c(1, 1, .35),
hjust = c(-.15, .15, .15),
color = "black")## Warning: Removed 20 rows containing non-finite values (stat_ecdf).
width = 12
height = 12
ggsave(here::here("docs/plots", clump_params, "20220314_final.png"),
final_figure,
device = "png",
width = width,
height = height,
units = "in",
dpi= 400)
ggsave(here::here("docs/plots", clump_params, "20220314_final.pdf"),
final_figure,
device = "pdf",
width = width,
height = height,
units = "in",
dpi= 400)knitr::include_graphics(here::here("docs/plots", clump_params, "20220314_final.png"))